import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import plotly.graph_objs as go
import plotly.express as px
import networkx as nx
from collections import defaultdict
from scipy.spatial import KDTree
from sklearn.linear_model import Perceptron
import scipy.stats as st
import numpy as np
import math
# --- Load and prepare the raw per-measurement data --------------------------
df = pd.read_csv('data.csv')
# The patient id is present only on the first row of each patient's series,
# so forward-fill it. (fillna(method='ffill') is deprecated; use .ffill().)
df['PATIENT_ID'] = df['PATIENT_ID'].ffill()
df['Admission time'] = pd.to_datetime(df['Admission time'])
df['Discharge time'] = pd.to_datetime(df['Discharge time'])
# Length of the hospital stay in whole days.
df['Duration of stay'] = (df['Discharge time'] - df['Admission time']).dt.days
# Keep the timestamps as int64 nanoseconds so they survive the numeric
# group-by aggregation below.
df['Admission time'] = df['Admission time'].values.astype(np.int64)
df['Discharge time'] = df['Discharge time'].values.astype(np.int64)
# Days elapsed between admission and each measurement timestamp (RE_DATE).
df.insert(0, "days_from_admission", df["RE_DATE"])
df["days_from_admission"] = pd.to_datetime(df["days_from_admission"])
df["days_from_admission"] = pd.to_timedelta(
    df["days_from_admission"].values.astype(np.int64) - df["Admission time"].values.astype(np.int64))
df["days_from_admission"] = df["days_from_admission"].dt.days
# One row per patient: the mean of every measurement column.
cleaned_df = df.groupby("PATIENT_ID").mean()
class Graph:
    """Undirected graph on vertices 0..V-1, stored as adjacency lists."""

    def __init__(self, V):
        self.V = V
        self.adj = [[] for _ in range(V)]

    def DFSUtil(self, temp, v, visited):
        """Collect (depth-first) every vertex reachable from v into temp."""
        visited[v] = True
        temp.append(v)
        for neighbour in self.adj[v]:
            if not visited[neighbour]:
                temp = self.DFSUtil(temp, neighbour, visited)
        return temp

    def addEdge(self, v, w):
        """Insert the undirected edge (v, w)."""
        self.adj[v].append(w)
        self.adj[w].append(v)

    def connectedComponents(self):
        """Return the connected components as lists of vertex indices."""
        visited = [False] * self.V
        components = []
        for vertex in range(self.V):
            if not visited[vertex]:
                components.append(self.DFSUtil([], vertex, visited))
        return components
class CorrelationPlot:
    """Interactive plotly graph of attribute correlation groups.

    Columns whose mutual |correlation| exceeds ``group_threshold`` are
    clustered into colour-coded groups (connected components of the
    thresholded correlation matrix, found via the ``Graph`` DFS helper).
    Edges with |correlation| above ``edge_threshold`` are drawn between
    nodes; hovering an intra-group (coloured) edge lists every member's
    correlation with the ``target`` column.
    """

    def __init__(self, df, target='outcome', title='', group_threshold=.9, edge_threshold=.7,
                 correlated_columns=[], color_groups={},
                 groupped_attributes=[], color_pallete=px.colors.qualitative.D3*16, min_group_size=3):
        # NOTE(review): the list/dict defaults are mutable and shared across
        # instances; they appear to be overwritten before use — confirm.
        self.df = df
        self.target = target
        self.correlated_columns = correlated_columns
        self.color_groups = color_groups
        self.edge_threshold = edge_threshold
        # Correlation of every column with the target (used in hover text).
        self.target_correlation = df.corrwith(df[target])
        self.groupped_attributes = None
        self.correlations_df = self.df.corr()
        self.group_threshold = group_threshold
        self.min_group_size = min_group_size
        self.color_pallete = color_pallete
        self.title = title
        # State populated by the pipeline methods below.
        self.edge_trace = []
        self.hover_info_trace = []
        self.hover_text = None
        self.node_trace = None
        self.node_x = None
        self.node_y = None
        self.graph = None
        self.pos = None

    def plot(self):
        """Run the full pipeline and display the figure."""
        self.get_groups()
        self.get_hover_text()
        self.create_graph()
        self.create_figures()
        self.get_plot()

    def get_groups(self):
        """Cluster columns into correlation groups and assign colours.

        Binarises the correlation matrix (1 where |corr| > group_threshold),
        extracts connected components via DFS, keeps components with at
        least ``min_group_size`` members, and maps each member column name
        to a palette colour in ``self.groups``.
        """
        # Zero the diagonal (self-correlation of exactly 1), then binarise.
        corr_df = self.df.corr().replace({1: 0})
        corr_df = corr_df.mask(abs(corr_df) > self.group_threshold, 1)
        corr_df = corr_df.mask(abs(corr_df) < 1, 0)
        # NOTE(review): the target column is hard-coded as 'outcome' here
        # rather than read from self.target — confirm intended.
        corr_df = corr_df.drop(index='outcome', columns='outcome')
        # Keep only columns strongly correlated with at least one other.
        self.correlated_columns = corr_df.columns[(corr_df.T != 0).any()]
        corr_df = corr_df.loc[self.correlated_columns, self.correlated_columns]
        # Index pairs of the surviving 1-entries become graph edges.
        edges = list(zip(*np.where(corr_df[corr_df == 1].to_numpy() == 1)))
        g = Graph(len(self.correlated_columns))
        for i in edges:
            g.addEdge(*i)
        groupped_attributes = g.connectedComponents()
        groupped_attributes = [i for i in groupped_attributes if len(i) >= self.min_group_size]
        self.groupped_attributes = groupped_attributes
        # Translate vertex indices back into column names.
        for i in range(len(self.groupped_attributes)):
            for k in range(len(self.groupped_attributes[i])):
                self.groupped_attributes[i][k] = self.correlated_columns[self.groupped_attributes[i][k]]
        groups = {}
        for i in range(len(self.groupped_attributes)):
            for name in self.groupped_attributes[i]:
                groups[name] = self.color_pallete[i]
        self.groups = groups

    def get_hover_text(self):
        """Build per-group hover text keyed by group colour: column-aligned
        attribute names plus their correlation with the target."""
        res = defaultdict(list)
        for key, val in sorted(self.groups.items()):
            res[val].append(key)
        for key, val in res.items():
            # Pad names so the correlation column lines up in the hover box.
            length = len(max(val, key=len))
            for i in range(len(val)):
                val[i] = val[i].ljust(length + 1, ' ') + str(round(self.target_correlation[val[i]], 4)).rjust(7, ' ')
            # Header, e.g. "Attribute   CWO" (Correlation With <target initial>).
            header = '<b>Attribute'.ljust(len(val[0]) - 4, ' ') + f"CW{self.target[0].upper()}</b>"
            val.insert(0, header)
        self.hover_text = res

    def create_graph(self):
        """Build the networkx graph: one node per grouped attribute laid out
        on a circle; edges between pairs whose |correlation| lies strictly
        between ``edge_threshold`` and 1."""
        G = nx.Graph()
        for i in self.groupped_attributes:
            for j in i:
                G.add_node(j)
        self.pos = nx.circular_layout(G)
        for i in self.correlated_columns:
            for j in self.correlated_columns:
                # NOTE(review): `!= 'nan'` compares a float against a string
                # and is therefore always True; NaN entries are presumably
                # meant to be skipped here — confirm.
                if self.correlations_df[i][j] != 'nan' and i != j and abs(self.correlations_df[i][j]) > self.edge_threshold and abs(self.correlations_df[i][j]) < 1:
                    G.add_edge(i, j)
        self.graph = G

    @staticmethod
    def divide_line(point1, point2, n_splits=3):
        """Return the midpoints produced by ``n_splits`` rounds of recursive
        bisection of the segment (point1, point2).

        Used to scatter invisible hover markers along an edge so its hover
        text is reachable anywhere on the line.
        """
        a_list = list()
        points_to_split = [(point1, point2)]
        a = 0
        while len(points_to_split) != 0 and a != n_splits:
            # One round: split every currently pending sub-segment once.
            for g in range(0, len(points_to_split)):
                pair_of_points_to_split = points_to_split.pop(0)
                point1 = pair_of_points_to_split[0]
                point2 = pair_of_points_to_split[1]
                splitting_point = ((point1[0] + point2[0]) / 2, (point1[1] + point2[1]) / 2)
                a_list.append(splitting_point)
                points_to_split.append([point1, splitting_point])
                points_to_split.append([splitting_point, point2])
            a += 1
        return a_list

    def create_figures(self):
        """Build the plotly traces: one line per edge (group colour, black
        for cross-group edges; width grows with |correlation|), invisible
        hover markers along coloured edges, and the node trace."""
        result = []  # NOTE(review): unused local
        for edge in self.graph.edges:
            try:
                x0, y0 = self.pos[edge[0]]
                x1, y1 = self.pos[edge[1]]
                # Same-group edges take the group colour; others are black.
                if self.groups[edge[0]] == self.groups[edge[1]]:
                    c = self.groups[edge[0]]
                else:
                    c = 'black'
                # Width 6*|corr|^6 exaggerates the strongest correlations.
                trace = go.Scatter(x=[x0, x1, None],
                                   y=[y0, y1, None],
                                   line=dict(width=(6*abs(self.correlations_df[edge[0]][edge[1]])**6),
                                             color=c),
                                   mode="lines",
                                   line_shape='spline')
                self.edge_trace.append(trace)
                if c != 'black':
                    # Three invisible markers along the edge carry the
                    # group's hover text.
                    text = '<br>'.join(self.hover_text[c])
                    points = self.divide_line((x0, y0), (x1, y1), 2)
                    for x, y in points:
                        self.hover_info_trace.append(self.get_info_marker(c, text, x, y))
            except KeyError:
                # Edge endpoint not in any (big-enough) group: skip it.
                continue
        self.node_x, self.node_y = list((zip(*self.pos.values())))
        self.node_trace = go.Scatter(
            x=self.node_x, y=self.node_y,
            mode='markers',
            hoverinfo='text',
            marker=dict(
                color=list(self.groups.values()),
                size=20,
            )
        )
        node_text = []
        for node in self.graph.nodes():
            node_text.append(node)
        self.node_trace.text = node_text

    def get_plot(self):
        """Assemble and show the final figure: node markers, outward-shifted
        node labels, then the edge and hover-marker traces."""
        fig = go.Figure(data=[self.node_trace],
                        layout=go.Layout(
                            title=self.title,
                            titlefont_size=16,
                            showlegend=False,
                            hovermode='closest',
                            autosize=False,
                            width=1000,
                            height=900,
                            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                        )
        shift = 40
        for x, y, text in zip(self.node_x, self.node_y, self.node_trace.text):
            angle = np.arcsin(y) * 180.0 / np.pi  # NOTE(review): computed but unused
            s1, s2 = self.get_shift_sign(x, y)
            # Wrap long labels onto two lines at (roughly) the middle word.
            if len(text) > 20:
                temp = text.split()
                text = temp[:math.floor(len(temp)/2)] + ['<br>'] + temp[math.floor(len(temp)/2):]
                text = ' '.join(text)
            fig.add_annotation(x=x, y=y,
                               text=text,
                               showarrow=False,
                               yshift=shift*s1,
                               xshift=shift*s2)
        for trace in self.edge_trace:
            fig.add_trace(trace)
        for info in self.hover_info_trace:
            fig.add_trace(info)
        fig.show()

    @staticmethod
    def get_shift_sign(x, y):
        """Return [y_sign, x_sign] multipliers that push a node label away
        from the circle, based on which axis/quadrant the node sits in."""
        if abs(x) < 0.1:
            # Near the vertical axis: shift straight up or down.
            if y > 0:
                return [1, 0]
            else:
                return [-1, 0]
        if abs(y) < 0.1:
            # Near the horizontal axis: shift sideways, slightly further.
            if x > 0:
                return [0, 1.5]
            else:
                return [0, -1.5]
        # Otherwise shift diagonally, away from the circle centre.
        if x >= -.22 and y >= -.22:
            return [1, 1]
        elif x < -.22 and y > -.22:
            return [1, -1]
        elif x < -.22 and y < -.22:
            return [-1, -1]
        elif x > -.22 and y < -.22:
            return [-1, 1]

    @staticmethod
    def get_info_marker(color, text, x, y):
        """Return an invisible scatter marker carrying hover ``text`` at (x, y)."""
        return go.Scatter(
            x=[x],
            y=[y],
            text=[text],
            mode='markers',
            hoverinfo='text',
            marker=go.scatter.Marker(
                opacity=0,
                color=color
            )
        )
# Build and display the correlation-group graph for the per-patient means.
# NOTE(review): the 'groupped'/'corelation' typos below are in user-facing
# display text; correcting them would change the rendered title.
title = '<br>Attributes groupped by mutual corelation.<br>With their corelation with outcome (CWO).'
correlation_plot = CorrelationPlot(cleaned_df, title=title, color_pallete=px.colors.qualitative.D3,
                                   group_threshold=.7, edge_threshold=.5)
correlation_plot.plot()
The graph represents correlation groups based on their mutual correlations (multicollinearity). Each colored edge represents a correlation between the corresponding attributes. Colored edges are formed when the correlation between nodes is higher than a given threshold. Black edges represent the existence of a weaker correlation. The wider the line, the higher the correlation, starting from 0.7 and 0.5 for colored and black edges respectively. Groups are formed by disconnected subgraphs obtained from a depth-first search performed on the correlation matrix. While hovering the mouse over colored lines one can see the correlation between the grouped attributes and the target variable. The idea is to represent groups of similar attributes and choose the one which brings the most information. Based on Ockham's razor principle one should pursue simplicity, which also relates to aiming at the smallest number of attributes with the highest incremental information. The graph informs that before training the model one should strongly consider choosing only one representative of each group in order to avoid infusing the model with noise.
def group_instances(df, attributes, radius):
    """Merge rows whose ``attributes`` coordinates lie within ``radius``.

    Each point's KD-tree neighbourhood claims every not-yet-assigned row;
    each resulting group is collapsed to its column-wise mean plus a
    'count' column holding the number of merged rows.

    Returns a new DataFrame with one row per group (empty if df is empty).
    """
    X = np.array(list(df[attributes].itertuples(index=False, name=None)))
    kdt = KDTree(X)
    neighbourhoods = kdt.query_ball_point(X, r=radius)
    # Set membership instead of the original O(n^2) `k not in list` scan.
    assigned = set()
    matrix = []
    for neighbours in neighbourhoods:
        fresh = [k for k in neighbours if k not in assigned]
        assigned.update(fresh)
        if fresh:
            matrix.append(fresh)
    rows = []
    for idx in matrix:
        subset = df.iloc[idx]
        temp = subset.mean()
        temp['count'] = len(subset)
        rows.append(temp.to_frame().T)
    # DataFrame.append was removed in pandas 2.x; concat once instead of
    # re-appending inside the loop.
    return pd.concat(rows, ignore_index=True) if rows else pd.DataFrame([])
def get_decision_line(df, x_attribute, y_attribute, target):
    """Fit a Perceptron on two attributes and return its decision line.

    Returns (x0, y0, x1, y1, model, X, y) where the line segment spans
    from x = 0 to x = max of ``x_attribute``.
    """
    train_df = df[[x_attribute, y_attribute, target]].dropna()
    model = Perceptron(class_weight='balanced', random_state=69)
    X = train_df[[x_attribute, y_attribute]].to_numpy()
    y = train_df[target].to_numpy()
    model.fit(X, y)
    w1, w2 = model.coef_[0]
    b = model.intercept_[0]
    # Decision boundary: w1*x + w2*y + b = 0  =>  y = -(w1/w2)*x - b/w2.
    # Computing the slope directly fixes the 0/0 the previous form,
    # (-(b/w2) / (b/w1)), produced whenever the intercept b was zero.
    a = -w1 / w2
    intercept = -b / w2
    x = 0
    x1 = df[x_attribute].max()
    y0 = a * x + intercept
    y1 = a * x1 + intercept
    return x, y0, x1, y1, model, X, y
def plot_scatter(df, x_attribute, y_attribute, target, line_points):
    """Scatter plot of two attributes coloured by ``target``, with marginal
    box plots and the supplied decision line drawn on top.

    line_points: [(x0, y0), (x1, y1)] — endpoints of the decision line.
    """
    # Fix: use the parameters instead of the previously hard-coded column
    # names, so the helper works for any attribute pair.
    fig = px.scatter(df, x=x_attribute, y=y_attribute, size='count',
                     color=target, color_discrete_sequence=px.colors.qualitative.D3,
                     marginal_x="box", marginal_y="box")
    (x0, y0), (x1, y1) = line_points
    fig.update_layout(
        title_text=f"Influence of {x_attribute} and {y_attribute} on the recovery",
        shapes=[
            dict(type="line", xref="x", yref="y", x0=x0,
                 y0=y0, x1=x1, y1=y1, line_width=2, line_color='#2CA02C'),
        ])
    fig.show()
# Cluster nearby (neutrophils, %lymphocyte) measurements per outcome so the
# marker size can encode how many patients fall at each location.
data = cleaned_df[['neutrophils count', '(%)lymphocyte', 'outcome']].dropna()
# Per the label mapping below, outcome 0 = "survived" and 1 = "died"; the
# original variable names were swapped (the plotted labels were still
# correct because the mapping is applied to the outcome values, not names).
survived, died = data[data.outcome == 0], data[data.outcome == 1]
grouped_survived = group_instances(survived, ['neutrophils count', '(%)lymphocyte'], 2)
grouped_died = group_instances(died, ['neutrophils count', '(%)lymphocyte'], 2)
# Keep the original concatenation order (outcome==0 rows first).
data = pd.concat([grouped_survived, grouped_died])
data['outcome'] = data['outcome'].values.astype(str)
data['outcome'] = data['outcome'].replace({'0.0': "survived", "1.0": "died"})
x0, y0, x1, y1, model, X, y = get_decision_line(cleaned_df, "neutrophils count", '(%)lymphocyte', 'outcome')
plot_scatter(data, 'neutrophils count', '(%)lymphocyte', 'outcome', [(x0, y0), (x1, y1)])
The graph represents the impact of the mean number of neutrophils and the mean percentage of lymphocytes on recovery from Covid. In the graph we can see that the distribution of both parameters implies a nearly linear division between the dead and the recovered. Thus this information is very valuable for future model prediction. Even a simple, single, untuned Perceptron, trained on non-normalized data, is able to draw a linear decision boundary with 85% accuracy on the training data.
def quantitative_discretization(df, attribute, num_groups):
    """Add a quantile-bin column for ``attribute`` to ``df`` (in place).

    Returns the (mutated) df, the human-readable bin interval names in
    ascending order, and the name of the new column. The new column holds
    integer labels 1..num_groups, suitable as compact axis tick values.
    """
    categories = pd.qcut(df[attribute], num_groups)
    bins = pd.unique(categories).sort_values()
    bin_column = f'{attribute}_bins'
    # Re-cut with integer labels. (The original also built a bins_dict that
    # was never used; that dead code is removed.)
    df[bin_column] = pd.qcut(df[attribute], num_groups, labels=list(range(1, num_groups + 1)))
    bin_names = bins.astype(str)
    return df, bin_names, bin_column
def plot_violins(df, groupped_by, attribute, bin_names):
    """Split violin plot of ``attribute`` per ``groupped_by`` bin.

    For every bin value, the left (orange, side='negative') half shows
    rows with outcome == 1 ('died') and the right (blue, side='positive')
    half rows with outcome == 0 ('survived'). ``bin_names`` supplies the
    x-axis tick labels.
    """
    # Horizontal offsets of the raw-data point clouds for each half.
    pointpos_1 = [-0.3, -0.3, -.3, -0.3, -0.3]
    pointpos_2 = [0.4]*5
    fig = go.Figure()
    # One legend entry per outcome side.
    # NOTE(review): the lists below are indexed with [i-1], so the first
    # loop pass (i == 0) reads the LAST element (False) and the legend is
    # actually emitted on the second pass — confirm this is intended.
    show_legend = [True, False, False, False]
    for i in range(len(pd.unique(df[groupped_by]))):
        # Left half: outcome == 1 for the i-th bin value.
        fig.add_trace(go.Violin(x=df[groupped_by][(df['outcome'] == 1)
                                & (df[groupped_by] == pd.unique(df[groupped_by])[i])],
                                y=df[attribute][(df['outcome'] == 1) &
                                (df[groupped_by] == pd.unique(df[groupped_by])[i])],
                                legendgroup='1', scalegroup='1', name='died',
                                side='negative',
                                box_visible=True,
                                pointpos=pointpos_1[i-1],
                                line_color='#FF7F0E',
                                showlegend=show_legend[i-1]))
        # Right half: outcome == 0 for the same bin value.
        fig.add_trace(go.Violin(x=df[groupped_by][(df['outcome'] == 0)
                                & (df[groupped_by] == pd.unique(df[groupped_by])[i])],
                                y=df[attribute][(df['outcome'] == 0) &
                                (df[groupped_by] == pd.unique(df[groupped_by])[i])],
                                legendgroup='0', scalegroup='0', name='survived',
                                side='positive',
                                box_visible=True,
                                pointpos=pointpos_2[i-1],
                                line_color='#1F77B4',
                                showlegend=show_legend[i-1]))
    # Show means and all raw points; scale violin area by sample count.
    fig.update_traces(meanline_visible=True,
                      points='all',
                      jitter=0.05,
                      scalemode='count')
    fig.update_layout(
        title_text=f"{attribute}<br><i>groupped by Age",
        violingap=0, violingroupgap=0, violinmode='overlay',
        yaxis_title=f"{attribute}",
        xaxis_title="Age groups",
        xaxis=dict(
            tickmode='array',
            tickvals=[1, 2, 3, 4],
            ticktext=bin_names
        ))
    fig.show()
# Discretize age into quartiles, then show eGFR distributions per age group.
data, bin_names, column_name = quantitative_discretization(cleaned_df, 'age', 4)
fig = plot_violins(data, column_name, 'eGFR', bin_names)  # NOTE(review): plot_violins returns None
The graph represents the eGFR distribution in the quantile-based age groups. We can observe that the distribution of eGFR for the survivors is skewed toward higher values of eGFR in each group. Moreover, one can notice that this property vanishes for those who died. There is a clear downward trend of eGFR, which suggests that kidneys may gradually decrease their efficiency with age.
def get_mean_and_CI(df, attribute, inner_interval=0.5, outer_interval=.95):
    """Per-day mean of ``attribute`` with Student-t confidence intervals.

    Days with fewer than two non-null measurements are skipped (a CI needs
    at least two samples). Returns (means, days, outer_lo, outer_hi,
    inner_lo, inner_hi); the four interval series are tuples.
    """
    means, outer_interval_list, inner_interval_list, days = [], [], [], []
    for day, group in df.groupby('days_from_admission'):
        values = group[attribute].dropna().to_numpy()
        if len(values) > 1:
            days.append(day)
            means.append(values.mean())
            sem = st.sem(values)  # hoisted: used for both intervals
            outer_interval_list.append(st.t.interval(outer_interval, len(values) - 1, loc=values.mean(), scale=sem))
            inner_interval_list.append(st.t.interval(inner_interval, len(values) - 1, loc=values.mean(), scale=sem))
    if not days:
        # Fix: the original crashed on zip(*[]) unpacking when no day had
        # two or more samples; return empty series instead.
        return [], [], (), (), (), ()
    oi1, oi2 = list(zip(*outer_interval_list))
    ii1, ii2 = list(zip(*inner_interval_list))
    return means, days, oi1, oi2, ii1, ii2
def plot_attribute_change_in_days(means, days, oi1, oi2, ii1, ii2,
                                  attribute, inner_interval=0.5, outer_interval=.95):
    """Plot the daily mean of ``attribute`` with shaded confidence bands.

    Each band half is drawn as a pair of traces: an invisible baseline at
    the mean followed by a 'tonexty' fill trace, so the order in which the
    traces are added below is significant. The outer (95%) band uses a
    lighter fill than the inner (50%) band; the visible mean line is added
    last so it sits on top.
    """
    fig = go.Figure()
    # Outer band, lower half: invisible mean baseline, then fill to oi1.
    fig.add_trace(go.Scatter(x=days, y=means, mode='lines',
                             name=f'mean value of {attribute}', opacity=0, showlegend=False, hoverinfo='skip', line_width=0))
    fig.add_trace(go.Scatter(x=days, y=oi1, fill='tonexty',
                             mode='none', name=f'{outer_interval}% CI', fillcolor='rgba(255, 127, 14, 0.3)', legendgroup=f'{outer_interval}%'))
    # Outer band, upper half.
    fig.add_trace(go.Scatter(x=days, y=means, mode='lines',
                             name=f'mean value of {attribute}', showlegend=False, opacity=0, hoverinfo='skip', line_width=0))
    fig.add_trace(go.Scatter(x=days, y=oi2, fill='tonexty',
                             mode='none', name=f'{outer_interval}% CI', fillcolor='rgba(255, 127, 14, 0.3)', showlegend=False, legendgroup=f'{outer_interval}%'))
    # Inner band, lower half (darker fill).
    fig.add_trace(go.Scatter(x=days, y=means, mode='lines',
                             name=f'mean value of {attribute}', showlegend=False, opacity=0, hoverinfo='skip', line_width=0))
    fig.add_trace(go.Scatter(x=days, y=ii1, fill='tonexty',
                             mode='none', name=f'{inner_interval}% CI', fillcolor='rgba(255, 127, 14, 0.5)', showlegend=False, legendgroup=f'{inner_interval}%'))
    # Inner band, upper half.
    fig.add_trace(go.Scatter(x=days, y=means,
                             mode='lines', name=f'mean value of {attribute}', showlegend=False, opacity=0, hoverinfo='skip', line_width=0))
    fig.add_trace(go.Scatter(x=days, y=ii2, fill='tonexty',
                             mode='none', name=f'{inner_interval}% CI', fillcolor='rgba(255, 127, 14, 0.5)', legendgroup=f'{inner_interval}%'))
    # Visible mean line on top of the bands.
    fig.add_trace(go.Scatter(x=days, y=means, mode='lines',
                             name=f'mean value of {attribute}', marker_color='#1F77B4'))
    fig.update_layout(hovermode="x unified", title_text=f"{attribute} in days from admission",
                      xaxis_title="Days from admission", yaxis_title=f"{attribute}")
    fig.show()
# Mean Eosinophil count per day since admission with 50% and 95% CI bands.
plot_attribute_change_in_days(*get_mean_and_CI(df, 'Eosinophil count'), 'Eosinophil count')
In the graph we can observe that not only did the mean number of eosinophils rise over the consecutive days, but the dispersion also saw a significant increase. Eosinophils become active when you have certain allergic diseases, infections, and other medical conditions. The upward trend suggests that the more time one spends in the hospital, the more likely one is to catch an infection or disease. Interestingly, the growth of dispersion may represent the doctors' incessant struggle to keep the spreading of the diseases in check.
def plot_paralell(df, color_by, names, attributes):
    """Parallel-coordinates plot of ``attributes`` coloured by ``color_by``.

    ``names`` supplies the axis labels (which may differ from the
    underlying column names in ``attributes``). A default constraint
    filter is pre-set on the 'age' axis.
    """
    fig = go.Figure(data=
        go.Parcoords(
            line=dict(color=df[color_by], colorscale=[[0, '#1F77B4'], [1, '#FF7F0E']]),
            dimensions=list([
                dict(constraintrange=[40, 80], label=n, values=df[attr])
                if n == 'age' else dict(label=n, values=df[attr]) for n, attr in zip(names, attributes)])
        )
    )
    # Fix: the layout was previously built into an unused local variable
    # and never attached to the figure, so the title and font settings
    # were silently dropped. Apply them to the figure instead.
    fig.update_layout(
        title='<br>Parallel coordinates plot',
        title_x=0.5,
        title_y=1,
        font_size=15
    )
    fig.show()
# Parallel-coordinates view of selected attributes, coloured by outcome.
# The second list holds axis labels, the third the dataframe column names
# (e.g. 'C-protein' labels the 'High sensitivity C-reactive protein' column).
plot_paralell(cleaned_df, 'outcome',
['gender', 'age', '(%)lymphocyte', 'neutrophils(%)', 'Lactate dehydrogenase', 'C-protein', 'Total cholesterol', 'outcome'],
['gender', 'age', '(%)lymphocyte', 'neutrophils(%)', 'Lactate dehydrogenase', 'High sensitivity C-reactive protein', 'Total cholesterol', 'outcome'])
This plot depicts how different attributes correlate with the outcome (based on the color: orange for outcome = 1, blue for outcome = 0), as well as how they correlate with each other. For instance, we can see a strong negative correlation between (%)lymphocyte and neutrophils(%) - there is a "knot" between them. We can also notice that both Lactate dehydrogenase and High sensitivity C-reactive protein correlate with the outcome - in almost all cases when the outcome is 0, both of these attributes are low. In the case of outcome = 1, both attributes get higher and lose the "bond" between them (we can no longer see a clean, paintbrush-like swipe). When it comes to age, we can observe that the older a person is, the greater the chance that their outcome will be 1.
To better analyze the data, you can create and move a filter on every axis that will make only lines going through it colored (one was created by default on the age axis). You can also move every axis to a different place on the plot to analyze correlations between chosen attributes.
def bar_graph(df, frame_by, x_attr, attributes):
    """Animated grouped histogram of mean attribute values per ``x_attr`` bin.

    One animation frame per value of ``frame_by``; each bar group shows the
    average of one attribute within a bin of ``x_attr``.
    """
    # list(attributes) replaces the needless copy comprehension
    # `[attr for attr in attributes]`; the two intermediate assignments
    # are merged into one expression.
    melted_df = pd.melt(df[list(attributes)], id_vars=[frame_by, x_attr]).dropna()
    fig = px.histogram(melted_df, x=x_attr, y="value", color='variable', animation_frame=frame_by,
                       histfunc="avg", barmode="group", nbins=20, title="Chosen attributes in age groups depending on outcome",
                       range_x=[14, 100], hover_data=melted_df.columns, color_discrete_sequence=px.colors.qualitative.D3)
    fig.update_layout(
        title_x=0.5,
        font_size=16
    )
    fig.show()
# Animated (by outcome) histogram of mean attribute values across age bins.
bar_graph(df, 'outcome', 'age', ['age', 'outcome', 'Platelet count', 'Prothrombin activity', '(%)lymphocyte', 'Urea', 'D-D dimer'])
The bar chart represents how different attributes change based on the age group as well as on the outcome. When it comes to platelet count and prothrombin activity, we can notice that - on average - they are lower when the outcome is 1. A similar pattern is present for the (%)lymphocyte attribute; however, this pattern does not apply to the youngest people in the database - for them the attribute is even higher in this case. We could put forward a hypothesis that the percentage of lymphocytes has less to do with the outcome the younger the person is, or treat this outlier as just a deviation. When it comes to Urea and D-D dimer we can see a strong positive correlation with the outcome (especially in the case of D-D dimer, which grows significantly for outcome equal to 1).
You can uncheck the variables in the legend to better analyze ones with smaller values.
Having analyzed medical measurements for 375 patients with Covid-19 we were able to detect various interesting patterns. To begin with, we found out that some of the attributes bring the same information as others and thus there is no need to analyze them. Then we learned that having only measurements of the number of neutrophils and the percentage of lymphocytes we are able to separate whether the patient died or survived with astonishing results. Furthermore, we visualized that Covid-19 has a significant impact on kidney efficiency for patients above 62 years old. Then we posited a hypothesis as to why eosinophils change in time.
When it comes to Lactate dehydrogenase and High sensitivity C-reactive protein we noticed that it is easy to determine the outcome of a single patient based mostly on these attributes - the higher they are, the greater the chances for an outcome equal to 1. There also exists a noticeable inverse correlation between the age of a patient and their percentage of lymphocytes, which indicates that older patients have weaker immune systems. In the case of people with an outcome equal to 1, we noticed a significant increase in the level of D-D dimer, which indicates that clots were present in the bloodstreams of these patients.